package mpo.serve.processer;

import java.util.List;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;

/**
 * WebMagic {@link PageProcessor} for {@code news.163.com}.
 *
 * <p>Pages whose URL does not look like an article are used as link hubs: every
 * article link found on them is queued for crawling. Pages whose URL matches the
 * article pattern have their url, title, keywords, description and main content
 * stored as result fields, together with a fixed "source" label.
 */
public class News163Processer implements PageProcessor {

	/**
	 * Pattern of an article URL: three numeric path segments followed by a
	 * {@code <word>.html} file name. The dots are escaped so that they only match
	 * a literal '.', not an arbitrary character.
	 */
	private static final String ARTICLE_URL_REGEX = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+\\.html";

	/** Crawl configuration: domain, start URL and page charset. */
	@SuppressWarnings("deprecation")
	private final Site site = Site.me().setDomain("news.163.com").addStartUrl("http://news.163.com").setCharset("gb2312");

	/**
	 * Processes one downloaded page.
	 *
	 * <p>If the request URL is not an article URL, all article links on the page
	 * are added as target requests. If the page URL is an article URL, the
	 * article fields are extracted and stored on the page.
	 *
	 * @param page the downloaded page to inspect
	 */
	public void process(Page page) {
		// NOTE(review): the request URL and the page URL are presumably always the
		// same string; both checks are kept separate as in the original code.
		if (!page.getRequest().getUrl().matches(ARTICLE_URL_REGEX)) {
			// Only hub pages need their links collected.
			List<String> articleLinks = page.getHtml().links().regex(ARTICLE_URL_REGEX).all();
			page.addTargetRequests(articleLinks);
		}

		if (page.getUrl().toString().matches(ARTICLE_URL_REGEX)) {
			extractArticle(page);
		}
	}

	/** Stores the article fields of an article page as result fields on {@code page}. */
	private void extractArticle(Page page) {
		Html html = page.getHtml();

		page.putField("url", page.getUrl());
		page.putField("title", html.xpath("//h1[@id=h1title]/text()"));
		page.putField("keywords", html.xpath("//meta[@name=keywords]/@content"));
		page.putField("description", html.xpath("//meta[@name=description]/@content"));

		// The article body lives in the #endText element; it is re-parsed on its
		// own so that smartContent() only sees that fragment.
		// NOTE(review): if #endText is missing, toString() presumably yields null —
		// confirm that Html copes with a null input.
		page.putField("content", new Html(html.$("#endText").toString()).smartContent());

		// Extraction of "date" (//span[@id=pub_date]/text()) and "media"
		// (//span[@id=media_name]/*/text()) is currently disabled.

		// Fixed source label: "网易" is the Chinese name of NetEase.
		page.putField("source", "网易");
	}

	/**
	 * Returns the crawl configuration for this processor.
	 *
	 * @return the {@link Site} built for news.163.com
	 */
	public Site getSite() {
		return site;
	}

}
